# -*- coding: utf-8 -*-
#@Time : 2020/10/30 12:45
#@Author : 阳某
#@File : 读取word.py
#@Software : PyCharm



"""Word-frequency report for a Word document.

Reads every paragraph of a .docx file, segments the text with jieba, counts
the words that survive filtering, and writes the counts (most frequent first)
to an Excel workbook.
"""
from collections import Counter

import pandas as pd

# Default input document and output workbook, relative to the working directory.
SOURCE_DOCX = 'Python（计算机程序设计语言）.docx'
RESULT_XLSX = 'result.xlsx'


def read_docx_text(path):
    """Return the text of all paragraphs in the .docx at *path*, space-joined."""
    # Imported here so the pure helpers below stay usable without python-docx.
    import docx

    document = docx.Document(path)
    return ' '.join(para.text for para in document.paragraphs)


def filter_words(words):
    """Return the tokens worth counting, in their original order.

    A token is kept when it is longer than one character and contains at
    least one letter or digit.  The length check alone would let
    multi-character punctuation or whitespace tokens (for example '--' or
    '...') through, although the goal is to drop punctuation as well as
    meaningless single characters.

    *words* may be any iterable of strings, including a generator.
    """
    return [
        word
        for word in words
        if len(word) > 1 and any(ch.isalnum() for ch in word)
    ]


def frequency_table(counter):
    """Build a DataFrame with ``word`` and ``count`` columns from *counter*.

    Rows follow the counter's own iteration order; sorting is left to the
    caller.
    """
    return pd.DataFrame(list(counter.items()), columns=['word', 'count'])


def main(source=SOURCE_DOCX, target=RESULT_XLSX):
    """Run the whole pipeline and print a short preview after each step.

    Args:
        source: path of the .docx file to analyse.
        target: path of the Excel workbook to write.
    """
    # Imported here for the same reason as docx in read_docx_text.
    import jieba

    content = read_docx_text(source)
    print(len(content))

    # cut_all=False selects jieba's precise mode.
    segments = jieba.cut(content, cut_all=False)
    print(type(segments))

    words = filter_words(segments)
    print(words[:30])

    # Word frequencies.
    counter = Counter(words)
    # Preview of the first ten entries in first-seen order (not the top ten).
    for word, count in list(counter.items())[:10]:
        print(word, count)

    # Tabulate, then order by descending frequency.
    df = frequency_table(counter)
    print(df.head())
    df = df.sort_values(by='count', ascending=False)
    print(df.head())

    # Export to Excel; the DataFrame index is written as the first column.
    df.to_excel(target)


if __name__ == '__main__':
    main()
